import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import re
from collections import defaultdict
from sklearn.metrics import cohen_kappa_score
from typing import List

"""
This script calculates Cohen's kappa agreement between our own codes and the peer codes.

@author: Jonathan van Oudheusden
@date: 2024-01-28

Required files: 
    - combinedCodes.xlsx
    generated by combinePeerCoding.py
    needs columns 'My codes', 'PeerCodes'
    
Output files:
    - prints in the terminal
"""

# Excel workbook produced by combinePeerCoding.py; it must contain the
# 'My codes' and 'PeerCodes' columns that the rest of this script reads.
combinedCodes = 'combinedCodes.xlsx'
dfAllCodes = pd.read_excel(io=combinedCodes)


def splitCodes(ownCodes):
    """Split a comma-separated string of codes into a list of trimmed codes.

    Args:
        ownCodes: The cell content, e.g. ``"code a, code b"``. Anything that
            is not a string (such as NaN for a missing cell, or None) is
            treated as "no codes".

    Returns:
        The codes in their original order with surrounding whitespace
        removed. Empty entries (e.g. from a trailing comma or an empty
        string) are kept as ``''`` so callers can decide how to handle
        them; non-string input yields an empty list.
    """
    if not isinstance(ownCodes, str):
        # Splitting NaN or None would raise a TypeError/AttributeError.
        return []
    # Splitting on ',' and stripping each part gives the same result as
    # splitting on r'\s*,\s*' and stripping afterwards.
    return [code.strip() for code in ownCodes.split(',')]


# NOTE(review): these two counters are never updated in this script; they are
# kept only so that nothing referring to them breaks.
equalsCount = 0
notEqualCount = 0

# Maps code -> [[own count, peer count], kappa]. 99 is the placeholder kappa
# for codes whose kappa has not been calculated.
allCodesDict = defaultdict(lambda: [[0, 0], 99])
for index, row in dfAllCodes.iterrows():
    mycodes = row['My codes']
    peerCodes = row['PeerCodes']

    # Only rows that we coded ourselves take part in the comparison.
    if pd.isnull(mycodes) or mycodes.strip() == '':
        continue

    # add all own codes to dict with count
    for code in splitCodes(mycodes):
        if code != '':
            allCodesDict[code][0][0] += 1

    # add all peer codes to dict with count; an empty peer cell has no codes
    if pd.isnull(peerCodes):
        continue
    for peerCode in splitCodes(peerCodes):
        # Test the peer code itself for emptiness, not the last own code.
        if peerCode != '':
            allCodesDict[peerCode][0][1] += 1


# Collect the name of every counted code, leaving out the empty string.
allCodesList: List[str] = [name for name in allCodesDict if name != '']


print(f"Total amount of codes {len(allCodesDict)}")

# start reformatting the codes into a matrix of 0,1,
# where 1 means that that specific code was used in that row

# Keep only the rows that have something in 'My codes': drop NaN values and
# strings that are empty or whitespace-only. This is the same rule the
# counting loop applies to each row, so both passes see the same rows.
ownCodesColumn = dfAllCodes['My codes']
hasOwnCodes = ownCodesColumn.notna() & (ownCodesColumn.astype(str).str.strip() != '')
# Explicit copy so that the columns added later are written to an independent
# frame rather than to a slice of dfAllCodes.
dfAllCodesFiltered = dfAllCodes[hasOwnCodes].copy()

# Codes that are left out of the agreement calculation.
removedCodes = [
    'disabilities', 'contemplating quitting', 'wants to quit smoking',
    'change environment', 'indifferent about activity', 'sleep has effect',
    'weather', 'struggling with stopping', 'Participation', 'Mindset is important',
    'thinking about habits', 'need for motivation', 'positive mindset',
    'learned new things', 'used tips when getting the itch to smoke', 'positive in ability to quit',
    'reflected on past', 'improved mindset',
]

# Build the lookup set once instead of once per element of allCodesList.
removedCodesLookup = set(removedCodes)
allCodesList = [item for item in allCodesList if item not in removedCodesLookup]
# add entries for kappa with multilabel: one 0/1 column per code for our own
# coding and one 'Peer <code>' column for the peer coding
for code in allCodesList:
    dfAllCodesFiltered.loc[:, code] = 0
    dfAllCodesFiltered.loc[:, "Peer " + code] = 0

dfAllCodesFilteredcopy = dfAllCodesFiltered.copy()
# Only codes in allCodesList have indicator columns. Assigning through .loc
# for any other name (a removed code or an empty string) would silently add
# a stray, mostly-NaN column, so those codes are skipped.
keptCodes = set(allCodesList)
for index, row in dfAllCodesFilteredcopy.iterrows():
    mycodes = row['My codes']
    peerCodes = row['PeerCodes']

    # flag every own code used in this row
    for code in splitCodes(mycodes):
        if code in keptCodes:
            dfAllCodesFilteredcopy.loc[index, code] = 1

    # flag every peer code used in this row; an empty peer cell has no codes
    if pd.isnull(peerCodes):
        continue
    for peerCode in splitCodes(peerCodes):
        if peerCode in keptCodes:
            dfAllCodesFilteredcopy.loc[index, 'Peer ' + peerCode] = 1

# Per-code kappa: compare our 0/1 column with the matching peer column and
# store the score in the second slot of that code's dictionary entry.
for codeName in allCodesList:
    ownColumn = dfAllCodesFilteredcopy[codeName]
    peerColumn = dfAllCodesFilteredcopy['Peer ' + codeName]
    allCodesDict[codeName][1] = cohen_kappa_score(ownColumn, peerColumn)

# Print codes from most used to least used (own count + peer count).
# Entries still holding the placeholder 99 have no kappa and are skipped.
rankedEntries = sorted(
    allCodesDict.items(),
    key=lambda entry: sum(entry[1][0]),
    reverse=True,
)
sorted_dict = dict(rankedEntries)
for codeName, (usageCounts, kappaValue) in sorted_dict.items():
    if kappaValue == 99:
        continue
    print(f"{codeName}, Count: {usageCounts} kappa: {kappaValue}")

### calculate full cohen kappa value
# Each row's 0/1 flags are joined into one string, so every distinct
# combination of codes acts as a single label in the kappa calculation.
allCodesListWithPeer = ['Peer ' + codeName for codeName in allCodesList]

dfMyCodesMatrix = dfAllCodesFilteredcopy[allCodesList].copy()
dfPeerCodesMatrix = dfAllCodesFilteredcopy[allCodesListWithPeer].copy()

for flagMatrix in (dfMyCodesMatrix, dfPeerCodesMatrix):
    flagMatrix['Combined'] = flagMatrix.astype(str).apply(''.join, axis=1)

print("final cohen ")
overallKappa = cohen_kappa_score(dfMyCodesMatrix['Combined'], dfPeerCodesMatrix['Combined'])
print(overallKappa)

